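# 20180715
# author: 葛木瓜
# Crawl the finished registration records from the Xi'an Housing
# Administration Bureau purchase-intention registration platform.
# Registration platform for Chang'an district:
# http://124.115.228.93/zfrgdjpt/xmgsca.aspx?state=4
# Registration platform for districts other than Chang'an:
# http://124.115.228.93/zfrgdjpt/xmgs.aspx?state=4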


from byCsv import write_registered
from bs4 import BeautifulSoup
from urllib import request
import time
import os
import re

# CSV file that get_all_registered() deletes before starting a full crawl
# (presumably the same file write_registered() writes to -- confirm in byCsv).
csv_fp = './csvData/registered.csv'
# Listing page (state=4) of the platform for districts other than Chang'an.
url1 = 'http://124.115.228.93/zfrgdjpt/xmgs.aspx?state=4'
# Listing page (state=4) of the Chang'an district platform.
url2 = 'http://124.115.228.93/zfrgdjpt/xmgsca.aspx?state=4'


def get_soup(url, timeout=30):
    """
    Download a page and parse it with BeautifulSoup.

    :param url: address of the page to fetch
    :param timeout: seconds urllib may block on the connection before
        giving up, so that a stalled server cannot hang the crawl forever
    :return: BeautifulSoup tree built with the 'html.parser' backend
    :raises urllib.error.URLError: if the request cannot be completed
    :raises socket.timeout: if reading the response exceeds ``timeout``
    :raises UnicodeDecodeError: if the response body is not valid UTF-8
    """
    # A browser-like User-Agent is sent with every request.
    header = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:59.0) Gecko/20100101 Firefox/59.0"
    }
    req = request.Request(url, headers=header)
    # The context manager closes the HTTP response (and its socket) as soon
    # as the body has been read, instead of leaving it to the garbage collector.
    with request.urlopen(req, timeout=timeout) as resp:
        html = resp.read().decode('utf-8')
    return BeautifulSoup(html, 'html.parser')


def get_page_url(url):
    """
    Collect the pagination URLs found on a finished-registration listing page.

    Every element whose ``href`` matches the regex ``page`` is treated as a
    pagination link. Each href is appended to the part of ``url`` that
    precedes the first ``'xmgs'``.

    :param url: address of the listing page
    :return: list of pagination URLs; empty when the page has fewer than two
        matching links
    """
    paging = get_soup(url).find_all(href=re.compile('page'))
    base = url.split('xmgs')[0]
    # The last matching link is left out, exactly as before (presumably it is
    # a navigation link that repeats an earlier page -- TODO confirm).
    # Slicing an empty result is safe, so no emptiness guard is needed; the
    # former ``paging is not []`` check was an identity test that was always
    # True.
    return [base + link['href'] for link in paging[:-1]]


def _result_links(base, spans):
    """
    Turn '结果公示' spans into absolute URLs.

    The target is the text between the first ``='`` and the following ``'``
    in each span's ``onclick`` attribute, appended to ``base``.
    """
    return [base + span['onclick'].split("='")[1].split("'")[0] for span in spans]


def get_result_url(flag, url):
    """
    Collect the result-publication URLs from a finished-registration listing.

    :param flag: 33 selects a full crawl (the listing page plus every page
        returned by get_page_url); any other value limits the result to the
        first ``flag`` entries of the listing page itself
    :param url: address of the listing page
    :return: list of result-publication URLs
    """
    base = url.split('xmgs')[0]
    first_page = get_soup(url).find_all('span', string='结果公示')

    if flag != 33:
        # Slice before extracting so entries beyond ``flag`` are never parsed.
        return _result_links(base, first_page[0: flag])

    result_url = _result_links(base, first_page)
    # get_page_url() downloads the listing page, so call it exactly once.
    for page_i in get_page_url(url):
        spans = get_soup(page_i).find_all('span', string='结果公示')
        result_url.extend(_result_links(base, spans))
    return result_url


def get_regis_data(flag, result_url):
    """
    Extract the registration figures from result-publication pages.

    A record is a list whose first item is the project name (the text before
    ``' 意向'`` in the page title) followed by the stripped text of every
    ``<font>`` tag on the page.

    :param flag: 33 writes every record through write_registered(); any other
        value returns the first record that could be parsed
    :param result_url: iterable of result-publication page URLs
    :return: None when ``flag`` is 33 or when no page could be parsed,
        otherwise the first parsed record
    """
    for result_url_i in result_url:
        gs = get_soup(result_url_i)
        title = gs.find(string=re.compile('意向登记结果公示'))
        if title is None:
            # Without the title the page is not a result page; report it and
            # keep crawling instead of aborting with an AttributeError.
            print('Skipping %s: result title not found' % result_url_i)
            continue
        regis_data = [title.split(' 意向')[0].strip()]
        for data_i in gs.find_all('font'):
            # ``.string`` is None when the tag does not wrap exactly one
            # string; fall back to the tag's full text in that case.
            regis_data.append((data_i.string or data_i.get_text()).strip())
        if flag != 33:
            return regis_data
        write_registered(regis_data)
    return None


def get_all_registered():
    """
    Run a full crawl of both platforms.

    Removes the file at ``csv_fp`` if it exists, passes the header row to
    write_registered(), then crawls ``url1`` and ``url2`` in that order with
    flag 33 so that get_regis_data() hands every record to write_registered().

    :return: None
    """
    flag = 33
    if os.path.exists(csv_fp):
        os.remove(csv_fp)
    sources = (('clq', url1), ('caq', url2))
    header_row = ['项目名', '房源数', '登记数', '资料核验数', '核验通过数', '刚需数', '普通数', '未通过数']
    write_registered(header_row)
    for label, listing_url in sources:
        print('Crawling %s data ~~~' % label)
        links = get_result_url(flag, listing_url)
        get_regis_data(flag, links)


def get_partial_registered():
    """
    Build a text summary of the first listed result on each platform.

    Crawls ``url1`` and ``url2`` with flag 1, so only the first entry of each
    listing page is fetched. Each record is rendered as a project-name line
    followed by one labelled line per available figure and a blank line.
    A platform that yields no record is left out of the summary.

    :return: the summary text; only the heading when nothing was found
    """
    flag = 1
    # Labels for record[1:], in the order the figures appear in a record.
    labels = ['房源数', '登记数', '资料核验数', '核验通过数', '刚需数', '普通数', '未通过数']
    text = '【最新结束项目❗❗】\n\n'
    for name, listing_url in zip(['clq', 'caq'], [url1, url2]):
        print('Crawling %s data ~~~' % name)
        record = get_regis_data(flag, get_result_url(flag, listing_url))
        if not record:
            # get_regis_data() returns None when the listing has no result.
            continue
        lines = ['项目名：%s' % record[0]]
        # zip() stops at the shorter sequence, so records with only some of
        # the figures (e.g. the 3-item form) render without an IndexError.
        lines.extend('→ %s：%s' % (label, value)
                     for label, value in zip(labels, record[1:]))
        text += '\n'.join(lines) + '\n\n'
    return text


# Script entry point: run the full crawl when executed directly.
if __name__ == '__main__':

    get_all_registered()
    # Alternative: print the text summary built from the first result on
    # each platform instead of running the full crawl.
    # txt = get_partial_registered()
    # print(txt)

